#!/usr/bin/env python
# coding: utf8
# -*- coding: utf-8 -*-
# Source: https://rstudio-pubs-static.s3.amazonaws.com/79360_850b2a69980c4488b1db95987a24867a.html
'''
NLTK, a natural language toolkit for Python. A useful package for any natural language processing.
For Mac/Unix with pip: $ sudo pip install -U nltk
stop_words, a Python package containing stop words.
For Mac/Unix with pip: $ sudo pip install stop-words
gensim, a topic modeling package containing our LDA model.
For Mac/Unix with pip: $ sudo pip install gensim
'''
import os
# NOTE(review): this overwrites the OS executable search PATH with a
# site-packages directory (including a stray trailing '#'). PATH does not
# affect Python's module lookup — presumably sys.path.append was intended;
# confirm before relying on this line.
os.environ["PATH"] = "/usr/local/lib/python2.7/dist-packages/gensim#"
from nltk.tokenize import RegexpTokenizer
from stop_words import get_stop_words
from nltk.stem.porter import PorterStemmer
from gensim import corpora, models
import gensim
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import sys
import codecs
# Make the Python 2.7 library directory importable at runtime.
sys.path.append("/usr/local/lib/python2.7/")
def deumlaut(s):
    """
    Transliterate German umlauts and sharp s in *s* to ASCII
    (ae/oe/ue, Ae/Oe/Ue, ss) and return the result.

    The first group of pairs maps Latin-1 single-character escapes onto
    the script's native umlaut literals (so Latin-1-encoded input is
    normalised first); the second group then performs the ASCII
    transliteration. Replacement order is significant and preserved.
    """
    replacements = (
        ('\xdf', 'ss'),
        ('\xfc', 'ü'), ('\xdc', 'Ü'),
        ('\xf6', 'ö'), ('\xd6', 'Ö'),
        ('\xe4', 'ä'), ('\xc4', 'Ä'),
        ('ö', 'oe'), ('ä', 'ae'), ('ü', 'ue'),
        ('Ü', 'Ue'), ('Ö', 'Oe'), ('Ä', 'Ae'),
    )
    for old, new in replacements:
        s = s.replace(old, new)
    return s
# Tokenizer that splits on runs of word characters (drops punctuation).
tokenizer = RegexpTokenizer(r'\w+')
# German stop-word list (the input corpus is German text).
de_stop = get_stop_words('german')
# Porter stemmer used to reduce tokens to their stems below.
p_stemmer = PorterStemmer()
# Input file comes from the first command-line argument; read it as
# UTF-8 and close the handle deterministically via the context manager
# (the original left the file open).
filename = sys.argv[1]
with codecs.open(filename, "r", "utf-8") as f:
    text_unicode = f.read().encode("utf-8")
text_unicode = deumlaut(text_unicode)
'''
# create sample documents
doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
doc_d = "I often feel pressure to perform well at school, but my mother never seems to drive my brother to do better."
doc_e = "Health professionals say that brocolli is good for your health."
test_text = "Ein rosa Kleidchen, ein farblich passendes Strohhütchen auf dem Kopf, ein breites Lächeln: Carmen Geiss " \
"(50), wie man sie kennt. Für dieses Foto kassiert die Kölner Kult-Millionärin gerade allerdings einen " \
"üblen Shitstorm. Der Grund: Die Urlauberpose hat Carmen in einem kolumbianischen Armenviertel aufgenommen. Dazu die Facebook-Erklärung: „HEUTE GEHT ES MAL IN DIE SLUMS VON CARTAGENA“ Neben Carmens schrillem Outfit sorgt auch die Anreise der Geissens für Empörung: Die Millionärs-Familie legt im Luxus-Bötchen, das den Namen „Roberto Geissini“ trägt, im Hafen der Armen an."
'''
# Split the raw text into sentences; each sentence is treated as one
# "document" for the LDA model.
doc_set = sent_tokenize(text_unicode.decode("utf8"))

# One entry per document: the list of stemmed, stop-word-free tokens.
texts = []
for doc in doc_set:
    # Lower-case, then tokenise into word tokens. Distinct names are
    # used for the loop and comprehension variables — the original
    # reused `i` everywhere, which Python 2 list comprehensions rebind.
    tokens = tokenizer.tokenize(doc.lower())
    # Remove German stop words.
    stopped_tokens = [tok for tok in tokens if tok not in de_stop]
    # Stem the remaining tokens and collect the document.
    texts.append([p_stemmer.stem(tok) for tok in stopped_tokens])

# Build the id <-> term dictionary and the bag-of-words corpus.
dictionary = corpora.Dictionary(texts)
corpus = [dictionary.doc2bow(text) for text in texts]

# Train the LDA model; see
# https://radimrehurek.com/gensim/models/ldamodel.html
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=5,
                                           id2word=dictionary, passes=128,
                                           alpha='auto', eval_every=5)
'''
https://radimrehurek.com/gensim/models/ldamodel.html
LDA models can be trained - YEEEEEESSS
'''
# The original had a literal line break inside the string literal,
# which is a syntax error in every Python version; use an explicit
# \n escape instead.
print("\nLDAPROFILER OUTPUT: ")
print(ldamodel.print_topics(num_topics=2, num_words=2))